Table of contents

  1. Tidy Data

  2. Real World

  3. Medicare Advantage

class: inverse, center, middle name: tidy

Tidy Data


What is Tidy data?

Resources: - Paper: Tidy Data (Hadley Wickham, 2014 JSS) - Vignette: Tidy data (from the tidyr package)



Essentially: 1. Variables are columns 2. Observations are rows 3. Variables and observations make a table

Intro to Tidy data

Let’s load the tidyverse package and check the output:

library(tidyverse)

Comes with lots of other packages like ggplot2, tibble, dplyr, etc.

Pipes: %>%

Helps to break the pipes over several lines

mpg %>% 
  filter(manufacturer=="audi") %>% 
  group_by(model) %>% 
  summarise(hwy_mean = mean(hwy))
## # A tibble: 3 x 2
##   model      hwy_mean
##   <chr>         <dbl>
## 1 a4             28.3
## 2 a4 quattro     25.8
## 3 a6 quattro     24

1) dplyr::filter()

Multiple filters separated by commas:

starwars %>% 
  filter( 
    species == "Human", 
    height >= 190
    ) %>% head(5)
## # A tibble: 4 x 13
##   name                height  mass hair_color skin_color eye_color birth_year gender homeworld species films     vehicles  starships
##   <chr>                <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr>  <chr>     <chr>   <list>    <list>    <list>   
## 1 Darth Vader            202   136 none       white      yellow          41.9 male   Tatooine  Human   <chr [4]> <chr [0]> <chr [1]>
## 2 Qui-Gon Jinn           193    89 brown      fair       blue            92   male   <NA>      Human   <chr [1]> <chr [1]> <chr [0]>
## 3 Dooku                  193    80 white      fair       brown          102   male   Serenno   Human   <chr [2]> <chr [1]> <chr [0]>
## 4 Bail Prestor Organa    191    NA black      tan        brown           67   male   Alderaan  Human   <chr [2]> <chr [0]> <chr [0]>

2) dplyr::arrange()

Arrange in ascending order:

starwars %>% 
  arrange(birth_year) %>% head(5)
## # A tibble: 5 x 13
##   name                  height  mass hair_color skin_color eye_color birth_year gender homeworld species films     vehicles  starships
##   <chr>                  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr>  <chr>     <chr>   <list>    <list>    <list>   
## 1 Wicket Systri Warrick     88    20 brown      brown      brown              8 male   Endor     Ewok    <chr [1]> <chr [0]> <chr [0]>
## 2 IG-88                    200   140 none       metal      red               15 none   <NA>      Droid   <chr [1]> <chr [0]> <chr [0]>
## 3 Luke Skywalker           172    77 blond      fair       blue              19 male   Tatooine  Human   <chr [5]> <chr [2]> <chr [2]>
## 4 Leia Organa              150    49 brown      light      brown             19 female Alderaan  Human   <chr [5]> <chr [1]> <chr [0]>
## 5 Wedge Antilles           170    77 brown      fair       hazel             21 male   Corellia  Human   <chr [3]> <chr [1]> <chr [1]>

3) dplyr::select()

3) dplyr::select()

Rename within select():

starwars %>%
  select(alias=name, crib=homeworld, sex=gender)  %>% head(5)
## # A tibble: 5 x 3
##   alias          crib     sex   
##   <chr>          <chr>    <chr> 
## 1 Luke Skywalker Tatooine male  
## 2 C-3PO          Tatooine <NA>  
## 3 R2-D2          Naboo    <NA>  
## 4 Darth Vader    Tatooine male  
## 5 Leia Organa    Alderaan female

4) dplyr::mutate()

Create new variables with mutate()

starwars %>% 
  select(name, birth_year) %>%
  mutate(dog_years = birth_year * 7) %>%
  mutate(comment = paste0(name, " is ", dog_years, " in dog years."))  %>% head(5)
## # A tibble: 5 x 4
##   name           birth_year dog_years comment                            
##   <chr>               <dbl>     <dbl> <chr>                              
## 1 Luke Skywalker       19        133  Luke Skywalker is 133 in dog years.
## 2 C-3PO               112        784  C-3PO is 784 in dog years.         
## 3 R2-D2                33        231  R2-D2 is 231 in dog years.         
## 4 Darth Vader          41.9      293. Darth Vader is 293.3 in dog years. 
## 5 Leia Organa          19        133  Leia Organa is 133 in dog years.

4) dplyr::mutate()

Other handy ways to use mutate():

starwars %>% 
  select(name, height) %>%
  filter(name %in% c("Luke Skywalker", "Anakin Skywalker")) %>% 
  mutate(tall1 = height > 180) %>%
  mutate(tall2 = ifelse(height > 180, "Tall", "Short")) ## Same effect, but can choose labels
## # A tibble: 2 x 4
##   name             height tall1 tall2
##   <chr>             <int> <lgl> <chr>
## 1 Luke Skywalker      172 FALSE Short
## 2 Anakin Skywalker    188 TRUE  Tall

5) dplyr::summarise() with group_by()

starwars %>% 
  group_by(species, gender) %>% 
  summarise(mean_height = mean(height, na.rm = T)) %>% head(5)
## # A tibble: 5 x 3
## # Groups:   species [5]
##   species  gender mean_height
##   <chr>    <chr>        <dbl>
## 1 Aleena   male            79
## 2 Besalisk male           198
## 3 Cerean   male           198
## 4 Chagrian male           196
## 5 Clawdite female         168
Note: na.rm = T is usually a good idea, otherwise your summary will be NA too.
# 5) dplyr::summarise()
“scoped” variants also work with summarise() - summarise_all() affects every variable - summarise_at() affects named or selected variables - summarise_if() affects variables that meet some criteria (e.g. are numeric)
starwars %>% group_by(species, gender) %>% summarise_if(is.numeric, list(avg=mean), na.rm=T) %>% head(5)
## # A tibble: 5 x 5
## # Groups:   species [5]
##   species  gender height_avg mass_avg birth_year_avg
##   <chr>    <chr>       <dbl>    <dbl>          <dbl>
## 1 Aleena   male           79       15            NaN
## 2 Besalisk male          198      102            NaN
## 3 Cerean   male          198       82             92
## 4 Chagrian male          196      NaN            NaN
## 5 Clawdite female        168       55            NaN

Joining operations

library(nycflights13)
flights 
planes

Left join

dplyr guessed about which columns to join on (i.e. columns that share the same name). It also told us its choices:


## Joining, by = c("year", "tailnum")

Problem: the variable “year” does not have a consistent meaning across our joining datasets! - year of flight versus year of construction
# Left join
Luckily, there’s an easy way to avoid this problem. - See if you can figure it out before turning to the next slide. - Try ?dplyr::join.

Left join

Let’s be more explicit with the by = argument:

left_join(
  flights,
  planes %>% rename(year_built = year), ## Not necessary w/ below line, but helpful
  by = "tailnum" ## Be specific about the joining column
  ) %>%
  select(year, month, day, dep_time, arr_time, carrier, flight, tailnum, year_built, type, model) %>%
  head(5) ## Just to save vertical space on the slide
## # A tibble: 5 x 11
##    year month   day dep_time arr_time carrier flight tailnum year_built type                    model   
##   <int> <int> <int>    <int>    <int> <chr>    <int> <chr>        <int> <chr>                   <chr>   
## 1  2013     1     1      517      830 UA        1545 N14228        1999 Fixed wing multi engine 737-824 
## 2  2013     1     1      533      850 UA        1714 N24211        1998 Fixed wing multi engine 737-824 
## 3  2013     1     1      542      923 AA        1141 N619AA        1990 Fixed wing multi engine 757-223 
## 4  2013     1     1      544     1004 B6         725 N804JB        2012 Fixed wing multi engine A320-232
## 5  2013     1     1      554      812 DL         461 N668DN        1991 Fixed wing multi engine 757-232

Other dplyr goodies

pull(): Extract a column from a data frame as a vector or scalar. - e.g. starwars %>% filter(gender=="female") %>% pull(height)

count() and distinct(): Number and isolate unique observations. - e.g. starwars %>% count(species), or starwars %>% distinct(species) - You could also use a combination of mutate(), group_by(), and n(), e.g. starwars %>% group_by(species) %>% mutate(num = n()).

Some dplyr tips

.center[ :scale 800px]

Key tidyr verbs

  1. pivot_wider() and pivot_longer() to reshape data between wide and long format

  2. separate(): Split one column into multiple columns

  3. unite(): Combine multiple columns into one

1) tidyr::pivot_longer()

tidy_stocks <- stocks %>% 
  pivot_longer(cols=c("X","Y","Z"), 
               names_to="stock", values_to="price")
tidy_stocks
## # A tibble: 6 x 3
##   time       stock  price
##   <date>     <chr>  <dbl>
## 1 2009-01-01 X      1.62 
## 2 2009-01-01 Y      3.19 
## 3 2009-01-01 Z     -0.436
## 4 2009-01-02 X      0.709
## 5 2009-01-02 Y      0.590
## 6 2009-01-02 Z     -2.12

1) tidyr::pivot_wider()

tidy_stocks %>% 
  pivot_wider(values_from="price",names_from="stock")
## # A tibble: 2 x 4
##   time           X     Y      Z
##   <date>     <dbl> <dbl>  <dbl>
## 1 2009-01-01 1.62  3.19  -0.436
## 2 2009-01-02 0.709 0.590 -2.12

2) tidyr::separate()

economists %>% separate(name, c("first_name", "last_name")) 
## # A tibble: 3 x 2
##   first_name last_name
##   <chr>      <chr>    
## 1 Abhijit    Banerjee 
## 2 Esther     Duflo    
## 3 Michael    Kremer

Should also specify the separation character with separate(..., sep=" ").

3) tidyr::separate_rows()

## Now split out Jill's various occupations into different rows
jobs %>% separate_rows(occupation)
## # A tibble: 4 x 2
##   name  occupation    
##   <chr> <chr>         
## 1 Jack  Homemaker     
## 2 Jill  Philosopher   
## 3 Jill  Philanthropist
## 4 Jill  Troublemaker

4) tidyr::unite()

## Combine "yr", "mnth", and "dy" into one "date" column
gdp %>% unite(date, c("yr", "mnth", "dy"), sep = "-")
##       date       gdp
## 1 2016-1-1  98.88207
## 2 2016-1-2  98.80390
## 3 2016-1-3 102.44573
## 4 2016-1-4 101.36461

4) tidyr::unite()

library(lubridate)
gdp_u %>% mutate(date = ymd(date))
## # A tibble: 4 x 2
##   date         gdp
##   <date>     <dbl>
## 1 2016-01-01  98.9
## 2 2016-01-02  98.8
## 3 2016-01-03 102. 
## 4 2016-01-04 101.

Summary

.pull-left[ ### dplyr 1. filter() 2. arrange() 3. select() 4. mutate() 5. summarise()] .pull-right[ ### tidyr 1. pivot_longer() 2. pivot_wider() 3. separate() 4. unite()]


Other useful items include: pipes (%>%), grouping (group_by()), joining functions (left_join(), inner_join, etc.).

class: inverse, center, middle name: real_data

Real World


Advice 1: Be patient and careful in your coding

.center[ ]

class: inverse, center, middle name: ma_data

Medicare Advantage


Download the data

First step is to download the raw data that we’ll be using. - Monthly Enrollment - Plan Characteristics - Service Areas

Lots more out there, but this is enough for now.

1) Contract/enrollment info

    ## Clean the contract level data
    contract.info = contract.info %>%
      group_by(contractid, planid) %>%
      mutate(id_count=row_number())
    
    contract.info = contract.info %>%
      filter(id_count==1) %>%
      select(-id_count)

1) Contract/enrollment info

    ## Merge contract info with enrollment info
    plan.data = contract.info %>%
      left_join(enroll.info, by=c("contractid", "planid")) %>%
      mutate(month=as.numeric(m),year=y)
    
    assign(paste0("plan.data.",y,".",m),plan.data)
  }

1) Contract/enrollment info

  ## Fill in missing fips codes (by state and county)
  plan.month = plan.month %>%
    group_by(state, county) %>%
    fill(fips)

  ## Fill in missing plan characteristics by contract and plan id
  plan.month = plan.month %>%
    group_by(contractid, planid) %>%
    fill(plan_type, partd, snp, eghp, plan_name)
  
  ## Fill in missing contract characteristics by contractid
  plan.month = plan.month %>%
    group_by(contractid) %>%
    fill(org_type,org_name,org_marketing_name,parent_org)

Number of plans available

full.ma.data %>% group_by(fips, year) %>% select(fips, year) %>% summarize(plan_count=n()) %>%
  ggplot(aes(x=as.factor(year),y=plan_count)) + 
  stat_summary(fun.y="mean", geom="bar") +
  labs(
    x="Year",
    y="Number of Plans",
    title="Average Number of Plans per County"
  ) + scale_y_continuous(labels=comma) +
  theme_bw()

.plot-callout[ ]

Number of plans available

.left-code[

full.ma.data %>% 
  group_by(fips, year) %>% 
  select(fips, year) %>% 
  summarize(plan_count=n()) %>%
  ggplot(aes(x=as.factor(year),y=plan_count)) + 
  stat_summary(fun.y="mean", geom="bar") +
  labs(
    x="Year",
    y="Number of Plans",
    title="Average Number of Plans per County"
  ) + scale_y_continuous(labels=comma) +
  theme_bw()

]

.right-plot[ ]

Number of plans available

.left-code[

full.ma.data %>% 
  filter(snp=="No" & eghp=="No") %>%
  filter(planid < 800 | planid >= 900) %>% #<<
  filter(!is.na(planid)) %>% #<<
  group_by(fips, year) %>% 
  select(fips, year) %>% 
  summarize(plan_count=n()) %>%
  ggplot(aes(x=as.factor(year),y=plan_count)) + 
  stat_summary(fun.y="mean", geom="bar") +
  labs(
    x="Year",
    y="Number of Plans",
    title="Average Number of Plans per County"
  ) + scale_y_continuous(labels=comma) +
  theme_bw()

]

.right-plot[ ]

Interactive plot